#' ---
#' title: "Campaign Communication and Legislative Leadership (PSRM)"
#' subtitle: "06_merge_sources.R"
#' author: "Authors: Stefan Mueller and Naofumi Fujimura"
#' date: "Note: Code compiled successfully on `r format(Sys.time(), '%d %B %Y')`"
#' ---


# load packages
library(rio)         # CRAN v0.5.29
library(readstata13) # CRAN v0.10.1
library(dplyr)       # CRAN v1.1.2
library(stringr)     # CRAN v1.5.0

# print output of sessionInfo()
sessionInfo()

# If the code does not run, one or more packages may have been 
# updated, which may result in errors or conflicts. You can solve this issue
# by installing the package version listed above or by using the 
# groundhog package:
# after installing groundhog using install.packages("groundhog")
# change library(name_of_package) to
# groundhog::groundhog.library(name_of_package, date = "2024-01-31")
# Instead of adjusting the library() call for each package,
# you can adjust them all at once using
# the following syntax:
# groundhog.library("library('pkgA')
#                   library('pkgB')
#                   library('pkgC')", date = "2024-01-31")
# More details are available at: https://groundhogr.com/using/


# Load the dataset with legislative posts and harmonise the spelling of
# "labour"/"Labour" to "labor"/"Labor" in both policy-area variables.
# `pid` is a copy of `id_smith` (the original column is kept); it is the
# key used further below to join the Reed and Smith candidate data.
dat_all <- readRDS("data_positions_all.rds") |>
    mutate(
        policy_area_merge = str_replace_all(policy_area_merge,
                                            "labour", "labor"),
        policy_area_harmonised = str_replace_all(policy_area_harmonised,
                                                 "Labour", "Labor"),
        pid = id_smith
    )

# inspect the number of observations per harmonised policy area
table(dat_all$policy_area_harmonised)

# load Reed and Smith data on candidates
dat_reed_smith <- readstata13::read.dta13("Reed-Smith-JHRED-CANDIDATES.dta")

# exclude by-elections (result == 5 in the Reed and Smith codebook);
# note that filter() also drops rows where `result` is missing
dat_reed_smith <- dat_reed_smith |> 
    filter(result != 5)

# create additional variables based on election results
# - rows are sorted by candidate and year so that lead() within each
#   candidate (pid) returns the result of that candidate's next election
# - the grouping is only needed for lead(); it is removed again with
#   ungroup() so that it does not leak into later joins, filters and
#   select() calls that operate on this data frame
dat_reed_smith <- dat_reed_smith |> 
    arrange(pid, year) |> 
    group_by(pid) |> 
    mutate(votes_candidate = ku_vote,
           votes_district = ku_totvote,
           vote_share_candidate = votes_candidate / votes_district,
           elected_lead = lead(result),
           elected = result) |> 
    ungroup() |> 
    # move all vote-related columns to the front
    dplyr::select(starts_with("vote"), everything())


# get unique names for manual merging process conducted by RAs
# dat_reed_smith_names_years <- dat_reed_smith |> 
#   select(pid, year, name_jp, elected) |> 
#   filter(year > 2000  & elected != 0) |> 
#   select(pid, name_jp) |> 
#   unique()
# write_csv(dat_reed_smith_names_years, "data/data_names_reed_smith.csv")


# load classified manifesto salience
dat_manifestos <- readRDS("data_policy_areas_manifesto_bert.rds")

# Harmonise the predicted class labels and derive the merge keys:
# - policy_area_merge: lower-cased class label without spaces and commas
# - filename: copy of manifesto_id, used as the join key with dat_all
dat_manifestos <- dat_manifestos |>
    mutate(
        predicted_class = dplyr::recode(predicted_class,
                                        "Committees on Cabinet" = "Cabinet"),
        predicted_class = str_replace_all(predicted_class, "Labour", "Labor"),
        policy_area_merge = str_to_lower(str_squish(predicted_class)),
        policy_area_merge = str_replace_all(policy_area_merge, " ", ""),
        policy_area_merge = str_replace_all(policy_area_merge, ",", ""),
        filename = manifesto_id
    )

# remove observations without filenames
dat_all_relevant <- dat_all |>
    filter(!is.na(filename))

# compare the merge keys in both datasets before joining
table(dat_all_relevant$policy_area_merge)
table(dat_manifestos$policy_area_merge)

# merge legislative posts with the classified manifestos; all rows of
# dat_all_relevant are kept, unmatched rows get NA in the manifesto columns
dat_merged <- dat_all_relevant |>
    left_join(dat_manifestos,
              by = c("filename", "policy_area_merge"))


# unique candidate-year-manifesto combinations; used below to check
# what proportion of manifestos is available
dat_candidates_election <- dat_merged |>
    ungroup() |>
    dplyr::select(pid = id_smith, year, filename) |>
    distinct()


## From Reed and Smith codebook (p. 5)

# 0: Lost the election.
# 
# 1: Was elected in MMD or SMD in a general election.
# 
# 2: Ran in SMD but lost; was elected via PR list (a “zombie”).
# 
# 3: Was elected as a pure PR list candidate.
# 
# 4: Was elected midterm due to the resignation or death of sitting MP, either because the candidate was the runner-up within 3 months of the election under SNTV, or was in the next position on the party list under MMM (so-called kuriage tōsen). 
# 5: Was elected in a by-election.

# recode four candidates whose parties are not assigned correctly:
# the candidate-year combinations listed below are set to "LDP",
# every other row keeps its original party_en value
dat_reed_smith_rec <- dat_reed_smith |>
    mutate(party_en = case_when(
        (pid == "10715" & year == "2003") |
            (pid == "10888" & year == "2005") |
            (pid == "11830" & year == "2005") |
            (pid == "15819" & year == "2005") ~ "LDP",
        .default = party_en))


# get "missing" candidates: start from the (recoded) Reed and Smith data,
# attach the manifesto filename where one exists, and keep only
# - LDP candidates in 2003, 2005, 2012, 2014 and DPJ candidates in 2009
# - candidates elected in MMD/SMD (1) or as "zombie" (2), i.e. not
#   lost (0), not midterm (4), and not by-election (5)
dat_missing <- dat_reed_smith_rec |>
    left_join(dat_candidates_election, by = c("pid", "year")) |>
    dplyr::select(filename, everything()) |>
    filter(
        between(year, 2003, 2014),
        (year %in% c(2003, 2005, 2012, 2014) & party_en %in% c("LDP")) |
            (year == 2009 & party_en == "DPJ"),
        result != 0,
        !result %in% c(4, 5), # NOT by-election or midterm
        result %in% c(1, 2)   # elected MMD/SMD OR Zombie
    )

# spot check for one of the recoded candidates
dat_missing |>
    filter(pid == "15819" & year == "2005") |>
    dplyr::select(party_en, name_jp, filename)


# number of relevant candidates and their distribution across results
nrow(dat_missing)

table(dat_missing$result)

# create binary indicator for availability of manifesto
# (1 = a filename was matched, 0 = no manifesto matched)
dat_missing <- dat_missing |>
    mutate(manifesto_available = as.numeric(!is.na(filename)))

table(dat_missing$year,
      dat_missing$party_en)

# filter candidates whose manifesto is unavailable
dat_missing_1 <- dat_missing |>
    filter(manifesto_available == 0) |>
    dplyr::select(pid, name_jp, party_en, year) |>
    arrange(year, name_jp)
nrow(dat_missing_1)

# calculate proportions of available manifestos for subset of relevant legislators
# (percentage per election year and party, rounded to one decimal place);
# .groups = "drop" returns an ungrouped data frame and avoids the
# message summarise() prints when the output grouping is left implicit
dat_available_election <- dat_missing |> 
    group_by(year, party_en) |> 
    summarise(available_manifestos = round(100 * mean(manifesto_available, na.rm = TRUE), 1),
              .groups = "drop")

# print data frame
dat_available_election

# save for descriptive plot created in analysis script
write.csv(dat_available_election, "data_candidates_available.csv",
          fileEncoding = "utf-8",
          row.names = FALSE)


# merge candidate-level metadata from Reed and Smith with dat_merged
# NOTE(review): this join uses dat_reed_smith, not dat_reed_smith_rec,
# so the four manual party corrections are not part of the merged
# data -- confirm that this is intended
dat_merged_meta <- dat_merged |>
    left_join(dat_reed_smith, by = c("pid", "year"))


# load dataset with portfolio importance and build the merge key in the
# same way as for the manifestos: "Labour" -> "Labor", lower case,
# no spaces, no commas
dat_portfolio <- read.csv("data_importance_portfolios.csv",
                          fileEncoding = "utf-8",
                          stringsAsFactors = FALSE) |>
    mutate(
        year = as.numeric(year),
        policy_area = str_trim(committee),
        policy_area_merge = str_replace_all(policy_area, "Labour", "Labor"),
        policy_area_merge = str_to_lower(str_squish(policy_area_merge)),
        policy_area_merge = str_replace_all(policy_area_merge, " ", ""),
        policy_area_merge = str_replace_all(policy_area_merge, ",", "")
    )

# inspect the merge key on the side of the merged data
table(dat_merged_meta$policy_area_merge)


# labels used to recode the (title-cased) types of posts
labels_type_posts <- c(
    "Cabinet Posts" = "Cabinet Post",
    "Merged" = "Legislative Post (Combined)",
    "Ministers" = "Ministerial Post",
    "Committee Membership" = "Committee Post",
    "Party Policy Division" = "Party Policy Division Post"
)

# create dataframe for regression model:
# attach portfolio importance by year and policy area, move the
# portfolio variables to the front, then transform type_posts to
# title case and recode it to the labels defined above
dat_reg <- dat_merged_meta |>
    left_join(dat_portfolio,
              by = c("year", "policy_area_merge")) |>
    dplyr::select(portfolio_survey, committee,
                  importance_portfolio =  mean_importance,
                  everything()) |>
    ungroup() |>
    mutate(type_posts = str_to_title(type_posts)) |>
    mutate(type_posts = dplyr::recode(type_posts, !!!labels_type_posts))

# check the recoded categories (including missing values)
table(dat_reg$type_posts, useNA = "always")


# recode policy areas into Pekkanen et al. (2006) scheme
# The clauses are evaluated in order and the first match wins; policy
# areas matching none of the patterns are set to NA.
dat_reg <- dat_reg |>
    mutate(area_pekkanen = case_when(
        str_detect(policy_area_harmonised,
                   "Agriculture") ~ "Distributive",
        str_detect(policy_area_harmonised,
                   "Financial|Security|Cabinet") ~ "High Policy",
        str_detect(policy_area_harmonised,
                   "Trade") ~ "Distributive",
        str_detect(policy_area_harmonised,
                   "Education|Environment|Welfare") ~ "Public Goods",
        str_detect(policy_area_harmonised,
                   "Foreign|Internal") ~ "High Policy",
        str_detect(policy_area_harmonised,
                   "Infrastructure") ~ "Distributive")) |>
    ungroup()

# create variables combining policy area (or Pekkanen category) and
# type of post, separated by a double underscore
dat_reg <- dat_reg |>
    mutate(
        policy_area_type_post = paste(policy_area_harmonised, type_posts,
                                      sep = "__"),
        pekkanen_type_post = paste(area_pekkanen, type_posts, sep = "__")
    )


# make female a factor variable
dat_reg$female <- factor(dat_reg$female)

# inspect the distribution of the vote margin and of the total wins
summary(dat_reg$vote_margin)

table(dat_reg$totcwins)

## From Smith and Reed codebook
# TOTCWINS = the total number of times since 1947 (election time t inclusive)
# that the candidate has ever won in an election, including by-elections,
# and regardless of district or jiban.
# This variable is NOT exactly the same as the total number of terms
# served because a few candidates (such as Machimura Nobutaka in
# Hokkaido 5th District in 2010) resigned their seats midterm and then
# were re-elected in a by-election.
# This must be calculated using PID, RESULT, and LEGIS after dropping losing candidates.
# Previous wins (election time t – 1) must also be calculated.

# derive the number of previous wins and variables based on it
dat_reg <- dat_reg |>
    mutate(
        # previous (!) wins: 0 for candidates with totcwins = 1
        won_before_times = totcwins - 1,
        # previous wins, capped at 9
        won_before_times_max9 = ifelse(won_before_times >= 9,
                                       9, won_before_times),
        # at least one / at least two previous wins
        won_before_dummy = if_else(won_before_times >= 1, "Yes", "No"),
        won_before_at_least_2 = if_else(won_before_times >= 2, "Yes", "No"),
        # capped number of previous wins as a factor
        won_before_times_max9_factor = factor(won_before_times_max9)
    )

# recode election results and number of terms
# - result is stored as character from here on
# - type_elected: text labels for the result codes 0 to 3
# - number_of_terms: totcwins as character, with 10 or more wins
#   collapsed into "10+"
dat_reg <- dat_reg |>
    mutate(
        result = as.character(result),
        type_elected = dplyr::recode(
            result,
            "0" = "Lost",
            "1" = "Elected in SMD",
            "2" = "Zombie",
            "3" = "Elected as pure PR List Candidate"),
        number_of_terms = as.character(ifelse(totcwins >= 10,
                                              "10", totcwins)),
        number_of_terms = dplyr::recode(number_of_terms, "10" = "10+")
    )


# get lagged dependent variable (position in same area):
# within each candidate, policy area and type of post, rows are ordered
# by year and position_dummy_lag takes the value of the preceding row
# (NA for the first row of each group)
dat_reg <- dat_reg |>
    arrange(id_smith, policy_area_harmonised, type_posts, year) |>
    group_by(id_smith, policy_area_harmonised, type_posts) |>
    mutate(position_dummy_lag = lag(position_dummy)) |>
    ungroup() |>
    dplyr::select(position_dummy, position_dummy_lag, everything())

# get count of statements, get 0 if NA (not part of classified sentences)
dat_reg <- dat_reg |>
    mutate(count_bert = ifelse(is.na(n_sentences_class_bert),
                               0, n_sentences_class_bert))

# harmonise remaining labels of type_posts and replace missing
# proportions with 0
dat_reg <- dat_reg |>
    ungroup() |>
    mutate(
        type_posts = str_replace_all(type_posts, "Merged",
                                     "Legislative Post (Combined)"),
        type_posts = str_replace_all(type_posts, "Ministers", "Minister"),
        # relevant = only policy-area related sentences
        prop_policyarea_relevant_bert_no_na =
            ifelse(is.na(prop_policyarea_relevant_bert),
                   0, prop_policyarea_relevant_bert),
        prop_policyarea_bert_no_na =
            ifelse(is.na(prop_policyarea_bert),
                   0, prop_policyarea_bert)
    )


# create new variable indicating year of election and type of post
dat_reg <- dat_reg |>
    mutate(year_post = paste(year, type_posts, sep = "_"))

# change names of policy areas: add a comma before " and Industry",
# drop the first "Committees on " prefix, and use the spelling "Labor"
# NOTE(review): a value that already contains ", and Industry" would end
# up with a double comma after the first replacement -- verify the input
dat_reg <- dat_reg |>
    mutate(
        policy_area_harmonised = str_replace_all(policy_area_harmonised,
                                                 " and Industry",
                                                 ", and Industry"),
        policy_area_harmonised = str_remove(policy_area_harmonised,
                                            "Committees on "),
        policy_area_harmonised = str_replace_all(policy_area_harmonised,
                                                 "Labour", "Labor")
    )

# candidates who lost - check what is correct here
# their vote margin was very close to 50%
# Reed and Smith code them as "lost election"
dat_reg |>
    filter(result == 0) |>
    distinct(filename, pid, name_jp)

# check whether any candidate elected as pure PR list candidate is included
dat_reg |>
    filter(type_elected == "Elected as pure PR List Candidate") |>
    distinct(filename, pid, name_jp, voteshare)

# Build the final analysis data:
# - remove candidates who did not get elected (result = 0)
# - remove candidates coded as elected as pure PR list candidate
# - exclude "Justice Affairs" observations since not included in analysis
# Rows with a missing value in any of these variables are dropped as well.
dat_reg_store <- dat_reg |>
    filter(
        result != 0,
        type_elected != "Elected as pure PR List Candidate",
        policy_area_harmonised != "Justice Affairs"
    )

# number of manifestos before filtering (should be 1972)
n_distinct(dat_reg$filename)

# after filtering (should be 1970 since two unelected candidates are removed)
n_distinct(dat_reg_store$filename)

# filenames of all manifestos that remain in the analysis data
dat_reg_included <- dat_reg_store |>
    distinct(filename)

# save manifestos included in the analysis for Wordscores and keyness
write.csv(dat_reg_included, "data_manifestos_included.csv",
          fileEncoding = "utf-8", row.names = FALSE)

# save data for next steps (Wordfish scaling and analysis)
saveRDS(dat_reg_store, "data_analysis_bert.rds")
